## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (0.12.10)
## than is installed on your system (0.12.13). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at 
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
## Warning: replacing previous import 'BiocGenerics::var' by 'stats::var' when
## loading 'MLInterfaces'

TMT labeling efficiency

# # Read data in
# f <- "../rawdata/P164_TMT_10_plex_efficiency_PeptideGroups.txt"
# e <- grepEcols(f, "Abundance: F", split = "\t")
# xf <- readMSnSet2(f, e, sep = "\t")
# save(xf, file = "../rscript/P164_TMT_10_plex_efficiency_PeptideGroups.rda")
load("../rscript/P164_TMT_10_plex_efficiency_PeptideGroups.rda")

# How many high quality peptides are modified
hqp <- sum(fData(xf)$Confidence == "High")
hqpm <- sum(fData(xf)$Confidence == "High" & grepl("TMT", fData(xf)$Modifications))
perc <- round(hqpm / hqp * 100, 0)

Incorporation of the tags was evaluated by the number of peptides with the respective modification in the N-terminal and/or lysine residues1.

The labelling efficiency is: 98%.



Exploration data analysis

Raw data distribution

# # Read data in
# f <- "../rawdata/P164_TMT_10_plex_result_Proteins.txt"
# e <- grepEcols(f, "Abundance F", split = "\t")
# x <- readMSnSet2(f, e, sep = "\t")
# save(x, file = "../rscript/P164_TMT_10_plex_result_Proteins.rda")

# Read saved data
load("../rscript/P164_TMT_10_plex_result_Proteins.rda")

## Data annotation
# create pData
Sample <- gsub("Abundance\\.F1\\.\\d*[C|N]\\.Sample\\.(\\w*\\.\\d)\\.", "\\1", colnames(exprs(x)))
Sample <- gsub("Abundance\\.F1\\.\\d*\\.Sample\\.(\\w*\\.\\d)\\.", "\\1", Sample)
tmp <- data.frame(do.call(rbind, strsplit(Sample, "[.]")))
names(tmp) <- c("Sample", "Replicate")
pData(x) <- tmp

# Rename rows and colomns
sampleNames(x) <- paste(x$Sample, x$Replicate, sep = "_")
featureNames(x) <- fData(x)$Accession

## Number of high confidance proteins
cp <- data.frame(table(fData(x)$Protein.FDR.Confidence))
colnames(cp)[1] <- "Protein.FDR.Confidence"
tmp <- x[fData(x)$Protein.FDR.Confidence != "Low",]

The rawdata file contains 6161 proteins. The protein FDR confidance for these proteins can be:

  • High [<0.01]

  • Medium [<0.05]

  • Low [>0.05]


From now on the analysis will be carry on with high and medium confidance proteins.

The new dataset contains 6073 proteins.

Check for missing values

## Plot missing values
naplot(tmp)

x <- filterNA(tmp, 0)


All the missing values have been removed. The analysis will only includes the proteins with a value for each channel.

The dataset used for the analysis contains 5795 proteins.

Principal component analysis

The protein quantification values have been log2 transformed and a PC analysis has been performed to visualise the data correlation.

x <- log(x,2)
plot2D(t(x), fcol = "Sample", cex = 2, addLegend = "bottomleft")


The follow plot shows the distribution and correlation for every sample, with all the other samples.

library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:IRanges':
## 
##     reflect
pairs.panels(exprs(x))

The two plots show the data correlation. Control samples are clustering together very well, while the PERK samples seem to be more disperse.


Boxplot

boxplot(exprs(x), col = rep(c("yellow", "blue"), each = 5))

The boxplot shows that the CONTR distributions have very similar median, while the PERK samples show more variability.

Sample normalisation

Samples have been normalised using each channel median.

x <- normalise(x, "diff.median")

Boxplot of normalised samples

boxplot(exprs(x), col = rep(c("yellow", "blue"), each = 5))

### There is a negative point!!!!

 PCA of normalised samples

plot2D(t(x), fcol = "Sample", cex = 2, addLegend = "bottomleft")

After normalisation the PCA plot looks better, with a clear separation on the PC1 of the two samples.

Here you can see the normalised data. This dataset is the one use for the differential expression analysis.

Differential expression analysis (DEA)

To perform the DEA we used limma package2.

library(limma)
library(plotly)

## Differential analysis
## Model matrices
mat <- model.matrix(~ 0 + pData(x)$Sample)
colnames(mat) <- c("CONTR", "PERK")
pData(x)$cmat <- mat

## Functions
dostats <- function(x) {
    cont.matrix <- makeContrasts(diffexp  = PERK - CONTR,
                                 levels   = pData(x)[, "cmat"])
    ## Statistical analysis
    fit <- lmFit(exprs(x), pData(x)[, "cmat"])
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    ## Result tables
    tt1 <- topTable(fit2, adjust.method = "BH", coef = "diffexp", number = Inf)
}

## Statistical analysis
res <- dostats(x)

## Prepare data to be saved
fData(x)$diffexp <- res[featureNames(x), ]
dfr <- cbind(data.frame(exprs(x)),
             descr = fData(x)$Description,
             logFC = fData(x)$diffexp[, "logFC"],
             adj.P.Val = fData(x)$diffexp[, "adj.P.Val"])
# write.csv(dfr, "../analysis/proteins_medium.csv")

plot_ly(dfr, x = ~logFC, y = ~-log10(adj.P.Val), type = "scatter",
        text = ~paste("ID: ", rownames(dfr),
                      "\nDesc: ", descr,
                      "\nFC: ", round(logFC, 2),
                      "\n-log10 adj.Pval: ", round(-log10(adj.P.Val), 3),
                      "\nadjusted pvalue: ", adj.P.Val))

Proteomics vs Transcriptomics

In order to directly compare identified proteins and transcripts, the proteins correspondent transcripts have been identified on UniProt website.

The match was many times 1 protein to many transcripts.

The transcripts and proteins identifiers have been joined based on trascript IDs.

The following table reports the correspondences:

# Load proteomics data with transcripts (file generated by rscript/uniprot.R)
load("../rscript/prot_medium_to_trans.rda")
res[, 3:4] <- round(res[, 3:4], 2)

# Load transcriptomics data
tran <- read.csv("../rawdata/Perk flies (Affy proj 2489M).csv")
tran$regt <- ifelse(tran$FC > 2 & tran$FDR < 0.01, 1,
                   ifelse(tran$FC < -2 & tran$FDR < 0.01, -1, 0))

# Combine the 2 datasets
library(dplyr)
y <- inner_join(tran, res, by = "Transcript.ID")
yf <- full_join(tran, res, by = "Transcript.ID")

# Reorder the columns
y <- y %>% dplyr::select(1, 2, 6, 3:5, 7:9)
yf <- yf %>% dplyr::select(1, 2, 6, 3:5, 7:9)
# Rename column
names(y)[3:9] <- c("Protein.ID", "trans_FC", "trans_FDR", "trans_reg", "prot_FC",
                   "prot_FDR", "prot_reg")
names(yf)[3:9] <- c("Protein.ID", "trans_FC", "trans_FDR", "trans_reg", "prot_FC",
                   "prot_FDR", "prot_reg")

Inner join transcriptomics vs proteomics

The follow table contains only the proteins and transcript IDs common between the proteomics and transcriptomics datasets.

# Table
datatable(y,
          rownames = TRUE,
          filter = "top",
          extensions = 'Buttons',
          options = list(dom = 'Bfrtip',
                         buttons = c('csv', 'copy'))
          )


While here it is possible to check the proteins/transcripts regulations:

  • Up regulation: 1

  • No regulation: 0

  • Down regulation: -1

# Compare the proteins/transcipts regulation
tt <- table(y$prot_reg, y$trans_reg)
names(dimnames(tt)) <- c("Proteomics", "Transcriptomics")

# Table
datatable(tt,
          rownames = FALSE,
          #           filter = "top",
          extensions = 'Buttons',
          options = list(dom = 'Bfrtip',
                         buttons = c('csv', 'copy'))
          )
# Compare differentially regulated proteins between LEAVES and ROOTS
y$ind <- 0
for (i in 1:dim(y)[1]){
  if (y$prot_reg[i] > 0) {
    y$ind[i] <- 1
  }
  if (y$prot_reg[i] < 0) {
    y$ind[i] <- 2
  }
  if (y$trans_reg[i] > 0) {
    y$ind[i] <- 3
  }
  if (y$trans_reg[i] < 0) {
    y$ind[i] <- 4
  }
  if (y$prot_reg[i] > 0 & y$trans_reg[i] > 0) {
    y$ind[i] <- 5
  }
  if (y$prot_reg[i] < 0 & y$trans_reg[i] < 0) {
    y$ind[i] <- 6
  }
  if (y$prot_reg[i] > 0 & y$trans_reg[i] < 0) {
    y$ind[i] <- 7
  }
  if (y$prot_reg[i] < 0 & y$trans_reg[i] > 0) {
    y$ind[i] <- 8
  }
}

y <- y %>% arrange((ind))

library(ggthemes)

p1 <- ggplot(y,aes(prot_FC,trans_FC,colour = factor(ind)))
p1 + geom_point(size=1) +
    theme_few() +
    labs(x = "Proteomics FC",
         y = "Transcriptomics FC",
         title = "Compare proteomics-transcriptomics expression") +
    scale_colour_tableau(
    #     scale_colour_manual(values = cbPalette,
                        name = "Differential expressed\nbetween proteins and transcripts",
                        labels=c("Not regulated",
                                 "Up only in proteomics",
                                 "Down only in proteomics",
                                 "Up only in transcriptomics",
                                 "Down only in transcriptomics",
                                 "Up in proteomics and transcriptomicss",
                                 "Down in proteomics and transcriptomics",
                                 "Up in proteomics and down in transcriptomics",
                                 "Down in proteomics and up in transcriptomics"))

Full join transcriptomics vs proteomics

The follow table contains all the transcripts and all the identified proteins.

# Table
datatable(yf,
          rownames = TRUE,
          filter = "top",
          extensions = 'Buttons',
          options = list(dom = 'Bfrtip',
                         buttons = c('csv', 'copy'))
          )


Session Information

sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_3.4.0       dplyr_0.7.4          bindrcpp_0.2        
##  [4] plotly_4.7.1         ggplot2_2.2.1        limma_3.32.10       
##  [7] psych_1.7.8          pRoloc_1.16.1        MLInterfaces_1.56.0 
## [10] cluster_2.0.6        annotate_1.54.0      XML_3.98-1.9        
## [13] AnnotationDbi_1.38.2 IRanges_2.10.5       S4Vectors_0.14.7    
## [16] DT_0.2               MSnbase_2.2.0        ProtGenerics_1.8.0  
## [19] BiocParallel_1.10.1  mzR_2.10.0           Rcpp_0.12.13        
## [22] Biobase_2.36.2       BiocGenerics_0.22.1  rmarkdown_1.6       
## [25] nvimcom_0.9-34      
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.1       plyr_1.8.4            igraph_1.1.2         
##   [4] lazyeval_0.2.0        splines_3.4.2         ggvis_0.4.3          
##   [7] crosstalk_1.0.0       digest_0.6.12         foreach_1.4.3        
##  [10] BiocInstaller_1.26.1  htmltools_0.3.6       viridis_0.4.0        
##  [13] gdata_2.18.0          magrittr_1.5          memoise_1.1.0        
##  [16] doParallel_1.0.11     sfsmisc_1.1-1         recipes_0.1.0        
##  [19] gower_0.1.2           rda_1.0.2-2           dimRed_0.1.0         
##  [22] lpSolve_5.6.13        colorspace_1.3-2      blob_1.1.0           
##  [25] jsonlite_1.5          RCurl_1.95-4.8        hexbin_1.27.1        
##  [28] lme4_1.1-14           genefilter_1.58.1     bindr_0.1            
##  [31] impute_1.50.1         survival_2.41-3       iterators_1.0.8      
##  [34] glue_1.1.1            DRR_0.0.2             gtable_0.2.0         
##  [37] ipred_0.9-6           zlibbioc_1.22.0       MatrixModels_0.4-1   
##  [40] kernlab_0.9-25        ddalpha_1.3.1         prabclus_2.2-6       
##  [43] DEoptimR_1.0-8        scales_0.5.0          vsn_3.44.0           
##  [46] mvtnorm_1.0-6         DBI_0.7               viridisLite_0.2.0    
##  [49] xtable_1.8-2          foreign_0.8-69        bit_1.1-12           
##  [52] proxy_0.4-17          mclust_5.3            preprocessCore_1.38.1
##  [55] lava_1.5.1            prodlim_1.6.1         httr_1.3.1           
##  [58] htmlwidgets_0.9       sampling_2.8          threejs_0.3.1        
##  [61] FNN_1.1               RColorBrewer_1.1-2    fpc_2.1-10           
##  [64] modeltools_0.2-21     pkgconfig_2.0.1       flexmix_2.3-14       
##  [67] nnet_7.3-12           caret_6.0-77          labeling_0.3         
##  [70] rlang_0.1.2           reshape2_1.4.2        munsell_0.4.3        
##  [73] mlbench_2.1-1         tools_3.4.2           RSQLite_2.0          
##  [76] pls_2.6-0             evaluate_0.10.1       stringr_1.2.0        
##  [79] mzID_1.14.0           yaml_2.1.14           ModelMetrics_1.1.0   
##  [82] knitr_1.17            bit64_0.9-7           robustbase_0.92-7    
##  [85] randomForest_4.6-12   purrr_0.2.3           dendextend_1.5.2     
##  [88] nlme_3.1-131          whisker_0.3-2         mime_0.5             
##  [91] RcppRoll_0.2.2        biomaRt_2.32.1        compiler_3.4.2       
##  [94] e1071_1.6-8           affyio_1.46.0         tibble_1.3.4         
##  [97] stringi_1.1.5         lattice_0.20-35       trimcluster_0.1-2    
## [100] Matrix_1.2-11         nloptr_1.0.4          gbm_2.1.3            
## [103] MALDIquant_1.16.4     data.table_1.10.4-2   bitops_1.0-6         
## [106] httpuv_1.3.5          R6_2.2.2              pcaMethods_1.68.0    
## [109] affy_1.54.0           hwriter_1.3.2         gridExtra_2.3        
## [112] codetools_0.2-15      MASS_7.3-47           gtools_3.5.0         
## [115] assertthat_0.2.0      CVST_0.2-1            rprojroot_1.2        
## [118] withr_2.0.0           mnormt_1.5-5          diptest_0.75-7       
## [121] grid_3.4.2            rpart_4.1-11          timeDate_3012.100    
## [124] tidyr_0.7.2           minqa_1.2.4           class_7.3-14         
## [127] shiny_1.0.5           lubridate_1.6.0       base64enc_0.1-3

References

1. Nogueira, F. C. S. et al. Performance of Isobaric and Isotopic Labeling in Quantitative Plant Proteomics. Journal of Proteome Research 11, 3046–3052 (2012).

2. Ritchie ME, W. D., Phipson B & GK, S. Limma powers differential expression analyses for rna-sequencing and microarray studies. Nucleic Acids Research 43, e47 (2015).